The attributes of this dataset are: age, gender, heart rate, systolic blood pressure, diastolic blood pressure, blood sugar, CK-MB and troponin, with a negative or positive output. According to the provided information, the medical dataset classifies each record as either heart attack or none. The gender column in the data is normalized: male is set to 1 and female to 0. The glucose column is set to 1 if it is > 120; otherwise, 0. As for the output, positive is set to 1 and negative to 0.
The CK-MB test is a blood test that looks for a specific enzyme. That enzyme, creatine kinase-myocardial band, is most common in your heart but can also mean you have damage to other muscles in your body.
#loading dataset
import pandas as pd
import numpy as np
#visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#EDA
from collections import Counter
import pandas_profiling as pp
# data preprocessing
from sklearn.preprocessing import StandardScaler
# data splitting
from sklearn.model_selection import train_test_split
# data modeling
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#ensembling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ---- Load and inspect the dataset ----
# NOTE(review): hard-coded absolute Windows path — consider a relative path or config value.
data = pd.read_csv(r"C:\MACHINE LEARNING\heart disease\Medicaldataset.csv")
data.head()
data.shape
data.info()
pp.ProfileReport(data)   # automated EDA report (pandas-profiling)
data.columns
data.describe()

# Checking whether the data contains null values.
data.isnull().sum()

# Print all unique values per column to spot categorical fields.
for col in data.columns:
    print({col: data[col].unique()})

# 'Result' is the only object (categorical) column, so it is converted to
# numerical manually: negative -> 0, positive -> 1.
# BUG FIX: chained `replace(..., inplace=True)` is deprecated in pandas 2.x
# and may silently operate on a copy — assign the result back instead.
data['Result'] = data['Result'].replace({'negative': 0, 'positive': 1})

# Now let's check whether it converted or not.
data.dtypes

# Bar chart of the gender distribution (1 = male, 0 = female).
p = data.Gender.value_counts().plot(kind='bar')
From the above visualization we can see that the dataset contains more MALE patients than FEMALE patients.
# Histogram of each numeric feature to inspect its distribution.
for feature in ['Age', 'Heart rate', 'Systolic blood pressure',
                'Diastolic blood pressure', 'Blood sugar', 'CK-MB', 'Troponin']:
    plt.hist(data[feature])

# Bar chart and raw counts of the target classes.
p = data.Result.value_counts().plot(kind='bar')
data['Result'].value_counts()
# The data is unevenly distributed between the two classes:
# class 1 has noticeably more patients than class 0.
The above visualization makes it clearly visible that our dataset is imbalanced: the number of patients who are not suffering from a heart attack is roughly half the number of patients who are.
# Correlation heatmap over the (now fully numeric) columns.
plt.figure(figsize=(12,10))
p = sns.heatmap(data.corr(), annot=True, cmap='Reds')

# Build the feature matrix X (all columns but the last) and target Y (last column).
X = data.values[:, :-1]
Y = data.values[:, -1]
print(X.shape)
print(Y.shape)

# Standardise the features: zero mean and unit variance per column.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print(X)
# Scaling puts the feature values on a comparable range, which brings the
# data points closer together for distance- and gradient-based models.

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=10)
# ---- Logistic regression baseline ----
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)        # fit on the training split
Y_pred = classifier.predict(X_test)     # hard 0/1 predictions on the hold-out set
print(Y_pred)

# Evaluate the model.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", acc)
# f1, precision and recall are low for class 1, so tune the decision threshold.

# Store the predicted probabilities; column 1 is P(class == 1).
y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)

# Re-label predictions with a custom threshold of 0.44 instead of the default 0.5.
# (Indentation of this loop was lost in the notebook export; restored here.)
y_pred_class = []
for value in y_pred_prob[:, 1]:
    if value > 0.44:
        y_pred_class.append(1)
    else:
        y_pred_class.append(0)
#print(y_pred_class)

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cfm = confusion_matrix(Y_test, y_pred_class)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, y_pred_class))
acc = accuracy_score(Y_test, y_pred_class)
print("Accuracy of the model: ", acc)
#print(classification_report(Y_test, y_pred_class))

# Sweep thresholds 0.40..0.60 and report the error counts at each one.
for a in np.arange(0.4, 0.61, 0.01):
    predict_mine = np.where(y_pred_prob[:, 1] > a, 1, 0)
    cfm = confusion_matrix(Y_test, predict_mine)
    total_err = cfm[0, 1] + cfm[1, 0]
    print("Errors at threshold ", a, ":", total_err, " , type 2 error :",
          cfm[1, 0], " , type 1 error:", cfm[0, 1])
# While finalizing a threshold, first focus on reducing the total error
# compared to the base model, then pick the threshold that gives the fewest
# type 2 errors (false negatives — the costlier mistake here).
# ---- Support vector machine (RBF kernel) ----
from sklearn.svm import SVC
svc_model = SVC(kernel='rbf', gamma=0.1, C=5)
svc_model.fit(X_train, Y_train)
Y_pred = svc_model.predict(X_test)
print(list(Y_pred))
svc_model.score(X_train, Y_train)   # training accuracy

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# BUG FIX: this evaluation previously used y_pred_class (the thresholded
# logistic-regression predictions), so the SVM itself was never scored.
# It now evaluates the SVM's own predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", acc)
# ---- K-nearest neighbours, k = sqrt(n_train) (a common starting heuristic) ----
from sklearn.neighbors import KNeighborsClassifier

k_heuristic = int(np.sqrt(len(X_train)))
model_KNN = KNeighborsClassifier(n_neighbors=k_heuristic, metric='euclidean')
# other usable metrics: manhattan, minkowski

# Fit the model on the data and predict the values.
model_KNN.fit(X_train, Y_train)
Y_pred = model_KNN.predict(X_test)
print(list(zip(Y_test, Y_pred)))
np.sqrt(len(X_train))

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", acc)
from sklearn.metrics import accuracy_score

# Grid-search k = 1..99 and record the hold-out accuracy for each value.
# (Indentation of these loops was lost in the notebook export; restored here.)
my_dict = {}
for K in range(1, 100):
    model_KNN = KNeighborsClassifier(n_neighbors=K, metric="euclidean")
    model_KNN.fit(X_train, Y_train)
    Y_pred = model_KNN.predict(X_test)
    print("Accuracy is ", accuracy_score(Y_test, Y_pred), "for K-Value:", K)
    my_dict[K] = accuracy_score(Y_test, Y_pred)
my_dict

# Report every k that ties for the best accuracy.
best_acc = max(my_dict.values())  # hoisted: was recomputed on every iteration
for k in my_dict:
    if my_dict[k] == best_acc:
        print(k, ':', my_dict[k])
# ---- KNN refit with the chosen k (53) from the search above ----
from sklearn.neighbors import KNeighborsClassifier

model_KNN = KNeighborsClassifier(n_neighbors=53, metric='euclidean')
# metric alternatives: euclidean, manhattan, minkowski

# Fit the model on the data and predict the values.
model_KNN.fit(X_train, Y_train)
Y_pred = model_KNN.predict(X_test)
print(list(zip(Y_test, Y_pred)))

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", acc)
# ---- Decision tree classifier ----
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree = DecisionTreeClassifier(criterion='gini', random_state=10)
# Fit the model on the data and predict the values.
model_DecisionTree.fit(X_train, Y_train)
Y_pred = model_DecisionTree.predict(X_test)
#print(Y_pred)
#print(list(zip(Y_test)))

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", acc)
# ---- Gradient boosting via XGBoost ----
from xgboost import XGBClassifier

model_GradientBoosting = XGBClassifier(n_estimators=100, random_state=10)
# Fit the model on the data and predict the values.
model_GradientBoosting.fit(X_train, Y_train)
Y_pred = model_GradientBoosting.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Confusion matrix, per-class metrics, and overall accuracy.
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))
# ---- Random forest (ensemble of 100 trees) ----
from sklearn.ensemble import RandomForestClassifier

model_RandomForest = RandomForestClassifier(n_estimators=100, random_state=10)
# Fit the model on the data and predict the values.
model_RandomForest.fit(X_train, Y_train)
Y_pred = model_RandomForest.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Confusion matrix, per-class metrics, and overall accuracy on the hold-out set.
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))
model_RandomForest.score(X_train, Y_train)   # training accuracy
Accuracy scores gained by the algorithms while building the models:
Logistic Regression - 0.81313
SVM - 0.79040
KNN - 0.68181
Decision Tree - 0.98737
Random Forest - 0.99242
Using these results, we are able to build a machine-learning model (random forest is the best model, with an accuracy score of 0.99242) to predict whether a patient is suffering from heart disease or not.
# Per-feature importance scores from the fitted random forest,
# in the same order as the feature columns of the training data.
model_RandomForest.feature_importances_
From the above output it is not clear which feature is the most important, so we now make a visualization of the importances.
# Pair each feature name with its random-forest importance score.
sample = pd.DataFrame()
sample['Columns'] = data.columns[0:-1]
sample['Results'] = model_RandomForest.feature_importances_
# BUG FIX: sort_values returns a new frame; the sorted result was previously
# discarded, so the printed table appeared unsorted.
sample = sample.sort_values('Results', ascending=False)
print(sample)
df = pd.DataFrame(sample)
# Bar chart of feature importance, highest first.
plot = df.plot.bar(x='Columns', y='Results',)
Troponin is a type of protein found in the muscles of your heart. Troponin isn't normally found in the blood. When heart muscles become damaged, troponin is sent into the bloodstream. As heart damage increases, greater amounts of troponin are released in the blood.
Here, from the above visualization, it is clearly visible that TROPONIN has the highest feature importance in the dataset.
import pickle

# Firstly dump the model with pickle; `with` ensures the file handles are
# closed (the originals were left open).
with open("Heart_model1.pkl", 'wb') as f:
    pickle.dump(model_RandomForest, f)

# Load the saved model back.
with open("Heart_model1.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# One raw sample in column order:
# (Age, Gender, Heart rate, Systolic BP, Diastolic BP, Blood sugar, CK-MB, Troponin)
# — presumably; confirm against the CSV column order.
input_data1 = (21, 1, 94, 98, 46, 296, 6.75, 1.06)
input_data_as_numpy_array1 = np.asarray(input_data1)
input_reshape1 = input_data_as_numpy_array1.reshape(1, -1)
# BUG FIX: the model was trained on standardized features, so the raw input
# must pass through the same fitted scaler before predicting.
input_scaled1 = scaler.transform(input_reshape1)
prediction1 = loaded_model.predict(input_scaled1)
print(prediction1)
if prediction1[0] == 0:
    # Message text fixed: the originals were truncated/ungrammatical.
    print("The person does not have heart disease")
else:
    print("The person has heart disease")
!pip install gradio
import gradio as gr
import fsspec
pip install fsspec
def inference(age, sex, ch, cardio):
    """Return class probabilities for the Gradio demo.

    Parameters: age (years), sex ('female'/'male'), ch (cholesterol in mg/dl),
    cardio (resting ECG result code 0/1/2).
    Returns a dict mapping the two class labels to their predicted probabilities.

    (Indentation of this function body was lost in the notebook export;
    restored here.)
    """
    s = 0 if sex == 'female' else 1
    df = pd.DataFrame([[age, s, ch, cardio]],
                      columns=['Age', 'Sex', 'Cholestoral (in mg/dl)',
                               'Resting electrocardiographic results'])
    # NOTE(review): `st` and `trainedmodel` are not defined anywhere in this
    # file — presumably a fitted scaler and classifier from a different
    # notebook. Also, the random forest trained above expects 8 features,
    # not these 4 — confirm which model this demo is meant to serve.
    df = st.transform(df)
    pred = trainedmodel.predict_proba(df)[0]
    res = {'No Heart Desease': pred[0], 'Has Heart Desease': pred[1]}
    return res
# Gradio input widgets mirroring the four parameters of inference().
sex = gr.Radio(['female', 'male'], label="Sex")
age = gr.Slider(minimum=1, maximum=100, value=22, label="Age")
ch = gr.Slider(minimum=120, maximum=560, value=200, label="Cholestoral (in mg/dl)")
cardio = gr.Radio([0, 1, 2], label="Resting electrocardiographic results")
# Launch the live demo with a public share link; output is a label widget.
gr.Interface(inference, [age, sex, ch, cardio], "label", live=True).launch(share=True) #, debug=True Use in Colab